In [2]:
import struct
import os
import numpy as np
import pandas as pd
import random
import plotly.graph_objects as go
import math
In [7]:
# Absolute paths to the binary ".comparisons" files analysed in this notebook.
# The file names encode the experiment variant (q threshold, margin, model, dataset size).
COMP_FILE = "/media/eduseiti/bigdata02/unicamp/doutorado/bootstrap.pytorch/data/linfeng/sample_embeddings_001825.comparisons"
COMP_FILE_Q001 = "/media/eduseiti/bigdata02/unicamp/doutorado/bootstrap.pytorch/data/linfeng_q0.01/sample_embeddings_q0.01_001825.comparisons"
COMP_FILE_Q001_04 = "/media/eduseiti/bigdata02/unicamp/doutorado/bootstrap.pytorch/data/linfeng_q0.01_margin0.4/sample_embeddings_q0.01_margin0.4_002281.comparisons"
COMP_FILE_Q0001 = "/media/eduseiti/Seagate Expansion Drive1/eduseiti/unicamp/doutorado/bootstrap.pytorch/data/linfeng_q0.001/sample_embeddings_q0.001_002281.comparisons"
COMP_FILE_Q001_LSTM40_3LAYERS = "/media/eduseiti/bigdata02/unicamp/doutorado/bootstrap.pytorch/data/linfeng_q0.01/sample_embeddings_q0.01_lstm40_3layers_002281.comparisons"

# Variants whose file names carry the "_big" tag.
COMP_FILE_Q001_BIG = "/media/eduseiti/Seagate Expansion Drive1/eduseiti/unicamp/doutorado/bootstrap.pytorch/data/linfeng_q0.01/sample_embeddings_q0.01_big_002281.comparisons"
COMP_FILE_Q0001_BIG = "/media/eduseiti/Seagate Expansion Drive1/eduseiti/unicamp/doutorado/bootstrap.pytorch/data/linfeng_q0.001/sample_embeddings_q0.001_big_002281.comparisons"

COMP_FILE_Q0001_BIG_LSTM40_3LAYER = "/media/eduseiti/Seagate Expansion Drive1/eduseiti/unicamp/doutorado/bootstrap.pytorch/data/linfeng_q0.001/sample_embeddings_q0.001_big_lstm40_3layer_002281.comparisons"

# NOTE(review): the M05 path below is under "Seagate Expansion Drive" while the neighbouring
# paths use "Seagate Expansion Drive1" -- confirm which mount point is the intended one.
COMP_FILE_Q0001_BIG_LSTM40_3LAYER_M04 = "/media/eduseiti/Seagate Expansion Drive1/eduseiti/unicamp/doutorado/bootstrap.pytorch/data/linfeng_q0.001/sample_embeddings_q0.001_big_lstm40_3layer_margin0.4_002281.comparisons"
COMP_FILE_Q0001_BIG_LSTM40_3LAYER_M05 = "/media/eduseiti/Seagate Expansion Drive/eduseiti/unicamp/doutorado/bootstrap.pytorch/data/linfeng_q0.001/sample_embeddings_q0.001_big_lstm40_3layer_margin0.5_002281.comparisons"

COMP_FILE_Q0001_BIG_LSTM40_3LAYER_M048 = "/mnt/f633ac7c-3153-4566-a009-229a0ae5f8a1/unicamp/doutorado/bootstrap.pytorch/data/linfeng_q0.001_margin0.48/sample_embeddings_q0.001_big_lstm40_3layer_margin0.48_002281.comparisons"

# The COMP_ALL_PVALUE_10* entries are bare file names; the cells below join them
# with BASE_PVALUE_FOLDER via os.path.join before decoding.
BASE_PVALUE_FOLDER="/media/eduseiti/data_storage_1TB/unicamp/clustering_linfeng_sample_pvalues/linfeng_q0.01_pvalue"
COMP_ALL_PVALUE_10="sample_embeddings_q0.01_all_lstm40_3layer_pvalue_0.1_002281.comparisons"
COMP_ALL_PVALUE_10_LOG_SCALING="sample_embeddings_q0.01_all_lstm40_3layer_pvalue_0.1_log_scaling_002281.comparisons"

COMP_ALL_PVALUE_10_TEST="sample_embeddings_q0.01_all_lstm40_3layer_pvalue_0.1_test_002281.comparisons"

COMP_ALL_PVALUE_10_WINSORIZING="sample_embeddings_q0.01_all_lstm40_3layer_pvalue_0.1_winsorizing_002281.comparisons"

COMP_ALL_PVALUE_10_IDENTIFICATIONS_FIX="sample_embeddings_q0.01_all_lstm40_3layer_pvalue_0.1_identifications_fix_002281.comparisons"

COMP_ALL_PVALUE_10_CELL_STATE="sample_embeddings_q0.01_all_lstm40_3layer_pvalue_0.1_cell_state_002281.comparisons"

COMP_ALL_PVALUE_10_CELL_STATE_NO_WINSORIZING="sample_embeddings_q0.01_all_lstm40_3layer_pvalue_0.1_cell_state_no_winsorizing_002281.comparisons"

# struct format of one comparison record: unsigned char, unsigned int, unsigned char,
# unsigned int, double. The fields are loaded below as file_1, scannr_1, file_2,
# scannr_2, cosine_similarity. There is no byte-order prefix, so the struct module
# uses native byte order, size and alignment (padding may be inserted between fields).
STRUCT_FIELDS = "BIBId"
In [4]:
def decode_comparisons_file(comparisons_filename, struct_fields=None):
    """Decode a binary comparisons file into a 2-D numpy array.

    The file is read as a sequence of fixed-size records, each unpacked with
    the ``struct`` format ``struct_fields``. Records whose fifth field is NaN
    are reported on stdout (raw bytes) but are still included in the result.

    Parameters
    ----------
    comparisons_filename : str
        Path of the binary file to decode.
    struct_fields : str, optional
        ``struct`` format string describing one record. Defaults to the
        module-level ``STRUCT_FIELDS``. Any override must keep a numeric value
        as the fifth field, because that field is the one checked for NaN.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(n_records, n_fields)``. An empty file yields an
        array of shape ``(0, n_fields)`` so that callers can still build a
        DataFrame with the expected columns.

    Raises
    ------
    struct.error
        If the file ends with a partial (truncated) record.
    """
    if struct_fields is None:
        struct_fields = STRUCT_FIELDS

    # Compile the format once instead of re-parsing it for every record.
    record_struct = struct.Struct(struct_fields)
    record_size = record_struct.size

    comparisons = []

    with open(comparisons_filename, "rb") as inputFile:
        while True:
            record = inputFile.read(record_size)

            if not record:
                break

            # A short read can only happen at end of file; report it with
            # enough context to locate the damaged file.
            if len(record) < record_size:
                raise struct.error(
                    "Truncated record at end of {}: expected {} bytes, got {}".format(
                        comparisons_filename, record_size, len(record)))

            unpacked = record_struct.unpack(record)

            comparisons.append(unpacked)

            if math.isnan(unpacked[4]):
                print("nan: {}".format(record))

    print("Decoded {} comparisons from {}".format(len(comparisons), comparisons_filename))

    if not comparisons:
        # np.array([]) would be 1-D; keep the column dimension for empty files.
        n_fields = len(record_struct.unpack(bytes(record_size)))
        return np.empty((0, n_fields))

    return np.array(comparisons)
In [5]:
def plot_comparissons_histogram(comparisons_filename, sample_fraction=0.1, bins=1000):
    """Decode a comparisons file, summarise it and plot a histogram of a random sample.

    The whole file is loaded into a DataFrame and the descriptive statistics
    of the ``cosine_similarity`` column (percentiles every 5%) are printed.
    A random subset of the rows is then histogrammed and shown as a bar chart.

    Parameters
    ----------
    comparisons_filename : str
        Path of the binary comparisons file, passed to
        ``decode_comparisons_file``.
    sample_fraction : float, optional
        Fraction of the rows drawn (without replacement, via ``random.sample``)
        for the histogram. Must be in ``(0, 1]``. Defaults to 0.1.
    bins : int or sequence, optional
        ``bins`` argument forwarded to ``numpy.histogram``. Defaults to 1000.

    Returns
    -------
    tuple
        ``(comparisons_df, histogram_counts, bin_edges)``.

    Raises
    ------
    ValueError
        If ``sample_fraction`` is outside ``(0, 1]``.
    """
    if not 0.0 < sample_fraction <= 1.0:
        raise ValueError("sample_fraction must be in (0, 1], got {}".format(sample_fraction))

    column_names = ["file_1", "scannr_1", "file_2", "scannr_2", "cosine_similarity"]

    comparisons = decode_comparisons_file(comparisons_filename)
    # An empty file may decode to a 1-D empty array; restore the column
    # dimension so the DataFrame construction below does not fail.
    comparisons = np.asarray(comparisons).reshape(-1, len(column_names))
    comparisons_df = pd.DataFrame(comparisons, columns=column_names)

    similarities = comparisons_df['cosine_similarity']

    print(similarities.describe(percentiles=list(np.round(np.arange(0.0, 1.0, 0.05), 2))))

    total_rows = len(comparisons_df)
    sampled_rows = random.sample(range(total_rows), int(total_rows * sample_fraction))

    # The decoder reports NaN similarities but keeps them; np.histogram cannot
    # derive a finite bin range from NaN values, so drop them from the sample.
    sampled_similarities = similarities.iloc[sampled_rows].dropna()

    cosSim_histogram, costSim_bin_edges = np.histogram(sampled_similarities, bins)

    # Place each bar at the centre of its bin rather than at its right edge.
    bin_centers = (costSim_bin_edges[:-1] + costSim_bin_edges[1:]) / 2

    fig = go.Figure()

    fig.add_trace(go.Bar(y=cosSim_histogram,
                         x=bin_centers,
                         marker_color='red'))

    fig.update_layout(title="Cosine similarity histogram ({:.0%} sample)".format(sample_fraction),
                      xaxis_title="cosine similarity",
                      yaxis_title="count")

    fig.show()

    return comparisons_df, cosSim_histogram, costSim_bin_edges

Similarities sample (10%) histogram of clustering using only identifications with q < 0.01 and pvalue at 10%, using cell state and no Winsorizing

In [8]:
# Only the histogram counts are kept; the DataFrame and the bin edges are discarded.
_, embeddings_hist, _ = plot_comparissons_histogram(
    os.path.join(
        BASE_PVALUE_FOLDER,
        COMP_ALL_PVALUE_10_CELL_STATE_NO_WINSORIZING,
    )
)
Decoded 45181026 comparisons from /media/eduseiti/data_storage_1TB/unicamp/clustering_linfeng_sample_pvalues/linfeng_q0.01_pvalue/sample_embeddings_q0.01_all_lstm40_3layer_pvalue_0.1_cell_state_no_winsorizing_002281.comparisons
count    4.518103e+07
mean     5.999468e-01
std      2.083475e-01
min      0.000000e+00
0%       0.000000e+00
5%       2.247053e-01
10%      3.079672e-01
15%      3.667974e-01
20%      4.146648e-01
25%      4.561856e-01
30%      4.935206e-01
35%      5.279150e-01
40%      5.601376e-01
45%      5.907887e-01
50%      6.202609e-01
55%      6.489402e-01
60%      6.771672e-01
65%      7.052819e-01
70%      7.335853e-01
75%      7.626252e-01
80%      7.928813e-01
85%      8.253194e-01
90%      8.617017e-01
95%      9.061875e-01
max      9.998177e-01
Name: cosine_similarity, dtype: float64

Similarities sample (10%) histogram of clustering using only identifications with q < 0.01 and pvalue at 10%, using cell state

In [6]:
# Only the histogram counts are kept; the DataFrame and the bin edges are discarded.
_, embeddings_hist, _ = plot_comparissons_histogram(
    os.path.join(
        BASE_PVALUE_FOLDER,
        COMP_ALL_PVALUE_10_CELL_STATE,
    )
)
Decoded 45181026 comparisons from /media/eduseiti/data_storage_1TB/unicamp/clustering_linfeng_sample_pvalues/linfeng_q0.01_pvalue/sample_embeddings_q0.01_all_lstm40_3layer_pvalue_0.1_cell_state_002281.comparisons
count    4.518103e+07
mean     7.184811e-01
std      1.881582e-01
min      0.000000e+00
0%       0.000000e+00
5%       3.442971e-01
10%      4.536700e-01
15%      5.228720e-01
20%      5.746037e-01
25%      6.164831e-01
30%      6.520939e-01
35%      6.832422e-01
40%      7.111543e-01
45%      7.365367e-01
50%      7.600418e-01
55%      7.820460e-01
60%      8.029428e-01
65%      8.230033e-01
70%      8.425386e-01
75%      8.617797e-01
80%      8.811033e-01
85%      9.009655e-01
90%      9.221926e-01
95%      9.467829e-01
max      9.998562e-01
Name: cosine_similarity, dtype: float64

Similarities sample (10%) histogram of clustering using only identifications with q < 0.01 and pvalue at 10%, the network fix, Winsorizing and identifications fix

In [5]:
# Only the histogram counts are kept; the DataFrame and the bin edges are discarded.
_, embeddings_hist, _ = plot_comparissons_histogram(
    os.path.join(
        BASE_PVALUE_FOLDER,
        COMP_ALL_PVALUE_10_IDENTIFICATIONS_FIX,
    )
)
Decoded 45181026 comparisons from /media/eduseiti/data_storage_1TB/unicamp/clustering_linfeng_sample_pvalues/linfeng_q0.01_pvalue/sample_embeddings_q0.01_all_lstm40_3layer_pvalue_0.1_identifications_fix_002281.comparisons
count    4.518103e+07
mean     7.602106e-01
std      1.625749e-01
min      0.000000e+00
0%       0.000000e+00
5%       4.450447e-01
10%      5.335300e-01
15%      5.901863e-01
20%      6.333075e-01
25%      6.687284e-01
30%      6.991550e-01
35%      7.261281e-01
40%      7.505179e-01
45%      7.729450e-01
50%      7.938686e-01
55%      8.135877e-01
60%      8.323577e-01
65%      8.504048e-01
70%      8.679799e-01
75%      8.853375e-01
80%      9.027481e-01
85%      9.205054e-01
90%      9.393431e-01
95%      9.606497e-01
max      9.999760e-01
Name: cosine_similarity, dtype: float64

Similarities sample (10%) histogram of clustering using only identifications with q < 0.01 and pvalue at 10%, the network fix, and Winsorizing

In [8]:
# Only the histogram counts are kept; the DataFrame and the bin edges are discarded.
_, embeddings_hist, _ = plot_comparissons_histogram(
    os.path.join(
        BASE_PVALUE_FOLDER,
        COMP_ALL_PVALUE_10_WINSORIZING,
    )
)
Decoded 45181026 comparisons from /media/eduseiti/data_storage_1TB/unicamp/clustering_linfeng_sample_pvalues/linfeng_q0.01_pvalue/sample_embeddings_q0.01_all_lstm40_3layer_pvalue_0.1_winsorizing_002281.comparisons
count    4.518103e+07
mean     7.532969e-01
std      1.633086e-01
min      0.000000e+00
0%       0.000000e+00
5%       4.379775e-01
10%      5.244826e-01
15%      5.806675e-01
20%      6.237324e-01
25%      6.592418e-01
30%      6.898998e-01
35%      7.170859e-01
40%      7.417871e-01
45%      7.645405e-01
50%      7.857851e-01
55%      8.059172e-01
60%      8.251235e-01
65%      8.437018e-01
70%      8.618374e-01
75%      8.797735e-01
80%      8.977691e-01
85%      9.161804e-01
90%      9.356125e-01
95%      9.574835e-01
max      9.998067e-01
Name: cosine_similarity, dtype: float64

Similarities sample (10%) histogram of clustering using only identifications with q < 0.01 and pvalue at 10% and the network fix

In [11]:
# Only the histogram counts are kept; the DataFrame and the bin edges are discarded.
_, embeddings_hist, _ = plot_comparissons_histogram(
    os.path.join(
        BASE_PVALUE_FOLDER,
        COMP_ALL_PVALUE_10_TEST,
    )
)
Decoded 45181026 comparisons from /media/eduseiti/data_storage_1TB/unicamp/clustering_linfeng_sample_pvalues/linfeng_q0.01_pvalue/sample_embeddings_q0.01_all_lstm40_3layer_pvalue_0.1_test_002281.comparisons
count    4.518103e+07
mean     7.576761e-01
std      1.720922e-01
min      0.000000e+00
0%       0.000000e+00
5%       4.140510e-01
10%      5.163339e-01
15%      5.806930e-01
20%      6.284674e-01
25%      6.669708e-01
30%      6.994428e-01
35%      7.277446e-01
40%      7.530131e-01
45%      7.760090e-01
50%      7.972385e-01
55%      8.171120e-01
60%      8.359561e-01
65%      8.540266e-01
70%      8.715339e-01
75%      8.887198e-01
80%      9.058028e-01
85%      9.231209e-01
90%      9.412685e-01
95%      9.615875e-01
max      9.999650e-01
Name: cosine_similarity, dtype: float64

Similarities sample (10%) histogram of clustering using only identifications with q < 0.01 and pvalue at 10% and applying log scaling on the spectra intensities

In [9]:
# Only the histogram counts are kept; the DataFrame and the bin edges are discarded.
_, embeddings_hist, _ = plot_comparissons_histogram(
    os.path.join(
        BASE_PVALUE_FOLDER,
        COMP_ALL_PVALUE_10_LOG_SCALING,
    )
)
Decoded 45181026 comparisons from /media/eduseiti/data_storage_1TB/unicamp/clustering_linfeng_sample_pvalues/linfeng_q0.01_pvalue/sample_embeddings_q0.01_all_lstm40_3layer_pvalue_0.1_log_scaling_002281.comparisons
count    4.518103e+07
mean     7.984037e-01
std      1.547374e-01
min      0.000000e+00
0%       0.000000e+00
5%       4.856370e-01
10%      5.842519e-01
15%      6.454720e-01
20%      6.901029e-01
25%      7.252265e-01
30%      7.542379e-01
35%      7.790551e-01
40%      8.009547e-01
45%      8.206643e-01
50%      8.387102e-01
55%      8.554431e-01
60%      8.711161e-01
65%      8.859417e-01
70%      9.001103e-01
75%      9.137969e-01
80%      9.271431e-01
85%      9.404106e-01
90%      9.540324e-01
95%      9.690831e-01
max      9.999127e-01
Name: cosine_similarity, dtype: float64

Similarities sample (10%) histogram of clustering using only identifications with q < 0.01 and pvalue at 10%

In [5]:
# Only the histogram counts are kept; the DataFrame and the bin edges are discarded.
_, embeddings_hist, _ = plot_comparissons_histogram(
    os.path.join(
        BASE_PVALUE_FOLDER,
        COMP_ALL_PVALUE_10,
    )
)
Decoded 45181026 comparisons from /media/eduseiti/data_storage_1TB/unicamp/clustering_linfeng_sample_pvalues/linfeng_q0.01_pvalue/sample_embeddings_q0.01_all_lstm40_3layer_pvalue_0.1_002281.comparisons
count    4.518103e+07
mean     7.786209e-01
std      1.598344e-01
min      0.000000e+00
0%       0.000000e+00
5%       4.697826e-01
10%      5.575931e-01
15%      6.130364e-01
20%      6.548875e-01
25%      6.892336e-01
30%      7.187339e-01
35%      7.449659e-01
40%      7.688437e-01
45%      7.909842e-01
50%      8.117289e-01
55%      8.314510e-01
60%      8.503471e-01
65%      8.686012e-01
70%      8.863679e-01
75%      9.036766e-01
80%      9.206150e-01
85%      9.372640e-01
90%      9.539052e-01
95%      9.712818e-01
max      9.999816e-01
Name: cosine_similarity, dtype: float64
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 

Similarities sample (10%) histogram of clustering using all distances

In [ ]:
# Histogram counts for COMP_FILE; the DataFrame and the bin edges are discarded.
_, embeddings_hist, _ = plot_comparissons_histogram(
    COMP_FILE
)

Similarities sample (10%) histogram of clustering using only identifications with q < 0.01

In [ ]:
# Histogram counts for COMP_FILE_Q001; the DataFrame and the bin edges are discarded.
_, embeddings_q001_hist, _ = plot_comparissons_histogram(
    COMP_FILE_Q001
)

Similarities sample (10%) histogram of clustering using only identifications with q < 0.001

In [ ]:
# Histogram counts for COMP_FILE_Q0001; the DataFrame and the bin edges are discarded.
_, embeddings_q0001_hist, _ = plot_comparissons_histogram(
    COMP_FILE_Q0001
)

Similarities sample (10%) histogram of clustering using only identifications with q < 0.01 and margin 0.4

In [ ]:
# Histogram counts for COMP_FILE_Q001_04; the DataFrame and the bin edges are discarded.
_, embeddings_q001_04_hist, _ = plot_comparissons_histogram(
    COMP_FILE_Q001_04
)

Similarities sample (10%) histogram of clustering using only identifications with q < 0.01 and an LSTM40 3-layer model

In [ ]:
# Histogram counts for COMP_FILE_Q001_LSTM40_3LAYERS; the DataFrame and the bin edges are discarded.
_, embeddings_q001_lstm40_3layers_hist, _ = plot_comparissons_histogram(
    COMP_FILE_Q001_LSTM40_3LAYERS
)

Similarities sample (10%) histogram of clustering using only identifications with q < 0.01 with bigger training dataset

In [ ]:
# Histogram counts for COMP_FILE_Q001_BIG; the DataFrame and the bin edges are discarded.
_, embeddings_q001_big_hist, _ = plot_comparissons_histogram(
    COMP_FILE_Q001_BIG
)

Similarities sample (10%) histogram of clustering using only identifications with q < 0.001 with bigger training dataset

In [ ]:
# Histogram counts for COMP_FILE_Q0001_BIG; the DataFrame and the bin edges are discarded.
_, embeddings_q0001_big_hist, _ = plot_comparissons_histogram(
    COMP_FILE_Q0001_BIG
)

Similarities sample (10%) histogram of clustering using only identifications with q < 0.001 with bigger training dataset and LSTM40 3-layer model

In [ ]:
# Histogram counts for COMP_FILE_Q0001_BIG_LSTM40_3LAYER; the DataFrame and the bin edges are discarded.
_, embeddings_q0001_big_lstm40_3layer_hist, _ = plot_comparissons_histogram(
    COMP_FILE_Q0001_BIG_LSTM40_3LAYER
)

Similarities sample (10%) histogram of clustering using only identifications with q < 0.001 with bigger training dataset and LSTM40 3-layer model trained with margin 0.4

In [ ]:
# Histogram counts for COMP_FILE_Q0001_BIG_LSTM40_3LAYER_M04; the DataFrame and the bin edges are discarded.
_, embeddings_q0001_big_lstm40_3layer_margin04_hist, _ = plot_comparissons_histogram(
    COMP_FILE_Q0001_BIG_LSTM40_3LAYER_M04
)

Similarities sample (10%) histogram of clustering using only identifications with q < 0.001 with bigger training dataset and LSTM40 3-layer model trained with margin 0.5

In [ ]:
# Histogram counts for COMP_FILE_Q0001_BIG_LSTM40_3LAYER_M05; the DataFrame and the bin edges are discarded.
_, embeddings_q0001_big_lstm40_3layer_margin05_hist, _ = plot_comparissons_histogram(
    COMP_FILE_Q0001_BIG_LSTM40_3LAYER_M05
)

Similarities sample (10%) histogram of clustering using only identifications with q < 0.001 with bigger training dataset and LSTM40 3-layer model trained with margin 0.48

In [ ]:
# Histogram counts for COMP_FILE_Q0001_BIG_LSTM40_3LAYER_M048; the DataFrame and the bin edges are discarded.
_, embeddings_q0001_big_lstm40_3layer_margin048_hist, _ = plot_comparissons_histogram(
    COMP_FILE_Q0001_BIG_LSTM40_3LAYER_M048
)
In [ ]: